Introduction

In professional sports, attaining the most amount of wins in a season is the ultimate goal. Player and team statistics are commonly used to project and predict the number of wins for an upcoming season. In this analysis, we will use team statistics from every Major League Baseball team from 1871 to 2006 to predict the number of wins for each team. We will address how we handled missing values, created new variables based on the data available to us, and transformed variables to help normalize the data. We will show how we selected our variables for each of the three multiple regression models, and compare the results of each before determining which model to use for our test data.

library(tidyverse)
library(gtExtras)
library(vtable)
library(kableExtra)
library(reactable)
library(GGally)
library(corrplot)
library(corrr)
library(caret)
library(leaps)
library(MASS)
library(mice)
library(dlookr)
library(VIM)
library(performance)
library(ggcorrplot)
library(rsample)
library(skimr)
library(performance)
library(car)
library(caret)
library(olsrr)
library(DataExplorer)
library(vip)

Data Exploration

mlb_training <- read.csv("https://raw.githubusercontent.com/moham6839/Data_621_HW1/main/moneyball-training-data.csv")
mlb_test <- read.csv("https://raw.githubusercontent.com/moham6839/Data_621_HW1/main/moneyball-evaluation-data.csv")

Since INDEX is not a variable we will be using, we dropped it from the training and test sets:

mlb_training <- mlb_training %>%
  dplyr::select(-INDEX)
mlb_test <- mlb_test %>%
  dplyr::select(-INDEX)

Glimpse and Summary of MLB Training Set

reactable(mlb_training)
mlb_training %>%
  glimpse()
## Rows: 2,276
## Columns: 16
## $ TARGET_WINS      <int> 39, 70, 86, 70, 82, 75, 80, 85, 86, 76, 78, 68, 72, 7…
## $ TEAM_BATTING_H   <int> 1445, 1339, 1377, 1387, 1297, 1279, 1244, 1273, 1391,…
## $ TEAM_BATTING_2B  <int> 194, 219, 232, 209, 186, 200, 179, 171, 197, 213, 179…
## $ TEAM_BATTING_3B  <int> 39, 22, 35, 38, 27, 36, 54, 37, 40, 18, 27, 31, 41, 2…
## $ TEAM_BATTING_HR  <int> 13, 190, 137, 96, 102, 92, 122, 115, 114, 96, 82, 95,…
## $ TEAM_BATTING_BB  <int> 143, 685, 602, 451, 472, 443, 525, 456, 447, 441, 374…
## $ TEAM_BATTING_SO  <int> 842, 1075, 917, 922, 920, 973, 1062, 1027, 922, 827, …
## $ TEAM_BASERUN_SB  <int> NA, 37, 46, 43, 49, 107, 80, 40, 69, 72, 60, 119, 221…
## $ TEAM_BASERUN_CS  <int> NA, 28, 27, 30, 39, 59, 54, 36, 27, 34, 39, 79, 109, …
## $ TEAM_BATTING_HBP <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H  <int> 9364, 1347, 1377, 1396, 1297, 1279, 1244, 1281, 1391,…
## $ TEAM_PITCHING_HR <int> 84, 191, 137, 97, 102, 92, 122, 116, 114, 96, 86, 95,…
## $ TEAM_PITCHING_BB <int> 927, 689, 602, 454, 472, 443, 525, 459, 447, 441, 391…
## $ TEAM_PITCHING_SO <int> 5456, 1082, 917, 928, 920, 973, 1062, 1033, 922, 827,…
## $ TEAM_FIELDING_E  <int> 1011, 193, 175, 164, 138, 123, 136, 112, 127, 131, 11…
## $ TEAM_FIELDING_DP <int> NA, 155, 153, 156, 168, 149, 186, 136, 169, 159, 141,…
  #kable() %>%
  #kable_styling()
mlb_training %>%
  summary() %>%
  kable() %>%
  kable_styling()
TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
Min. : 0.00 Min. : 891 Min. : 69.0 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0 1st Qu.:131.0
Median : 82.00 Median :1454 Median :238.0 Median : 47.00 Median :102.00 Median :512.0 Median : 750.0 Median :101.0 Median : 49.0 Median :58.00 Median : 1518 Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0 Median :149.0
Mean : 80.79 Mean :1469 Mean :241.2 Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6 Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779 Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5 Mean :146.4
3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2 3rd Qu.:164.0
Max. :146.00 Max. :2554 Max. :458.0 Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0 Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132 Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
NA NA NA NA NA NA NA’s :102 NA’s :131 NA’s :772 NA’s :2085 NA NA NA NA’s :102 NA NA’s :286
skim(mlb_training)
Data summary
Name mlb_training
Number of rows 2276
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TARGET_WINS 0 1.00 80.79 15.75 0 71.0 82.0 92.00 146 ▁▁▇▅▁
TEAM_BATTING_H 0 1.00 1469.27 144.59 891 1383.0 1454.0 1537.25 2554 ▁▇▂▁▁
TEAM_BATTING_2B 0 1.00 241.25 46.80 69 208.0 238.0 273.00 458 ▁▆▇▂▁
TEAM_BATTING_3B 0 1.00 55.25 27.94 0 34.0 47.0 72.00 223 ▇▇▂▁▁
TEAM_BATTING_HR 0 1.00 99.61 60.55 0 42.0 102.0 147.00 264 ▇▆▇▅▁
TEAM_BATTING_BB 0 1.00 501.56 122.67 0 451.0 512.0 580.00 878 ▁▁▇▇▁
TEAM_BATTING_SO 102 0.96 735.61 248.53 0 548.0 750.0 930.00 1399 ▁▆▇▇▁
TEAM_BASERUN_SB 131 0.94 124.76 87.79 0 66.0 101.0 156.00 697 ▇▃▁▁▁
TEAM_BASERUN_CS 772 0.66 52.80 22.96 0 38.0 49.0 62.00 201 ▃▇▁▁▁
TEAM_BATTING_HBP 2085 0.08 59.36 12.97 29 50.5 58.0 67.00 95 ▂▇▇▅▁
TEAM_PITCHING_H 0 1.00 1779.21 1406.84 1137 1419.0 1518.0 1682.50 30132 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 105.70 61.30 0 50.0 107.0 150.00 343 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 553.01 166.36 0 476.0 536.5 611.00 3645 ▇▁▁▁▁
TEAM_PITCHING_SO 102 0.96 817.73 553.09 0 615.0 813.5 968.00 19278 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 246.48 227.77 65 127.0 159.0 249.25 1898 ▇▁▁▁▁
TEAM_FIELDING_DP 286 0.87 146.39 26.23 52 131.0 149.0 164.00 228 ▁▂▇▆▁

Glimpse and Summary of MLB Test Set

reactable(mlb_test)
mlb_test %>%
  glimpse() %>%
  kable() %>%
  kable_styling()
## Rows: 259
## Columns: 15
## $ TEAM_BATTING_H   <int> 1209, 1221, 1395, 1539, 1445, 1431, 1430, 1385, 1259,…
## $ TEAM_BATTING_2B  <int> 170, 151, 183, 309, 203, 236, 219, 158, 177, 212, 243…
## $ TEAM_BATTING_3B  <int> 33, 29, 29, 29, 68, 53, 55, 42, 78, 42, 40, 55, 57, 2…
## $ TEAM_BATTING_HR  <int> 83, 88, 93, 159, 5, 10, 37, 33, 23, 58, 50, 164, 186,…
## $ TEAM_BATTING_BB  <int> 447, 516, 509, 486, 95, 215, 568, 356, 466, 452, 495,…
## $ TEAM_BATTING_SO  <int> 1080, 929, 816, 914, 416, 377, 527, 609, 689, 584, 64…
## $ TEAM_BASERUN_SB  <int> 62, 54, 59, 148, NA, NA, 365, 185, 150, 52, 64, 48, 3…
## $ TEAM_BASERUN_CS  <int> 50, 39, 47, 57, NA, NA, NA, NA, NA, NA, NA, 28, 21, 8…
## $ TEAM_BATTING_HBP <int> NA, NA, NA, 42, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ TEAM_PITCHING_H  <int> 1209, 1221, 1395, 1539, 3902, 2793, 1544, 1626, 1342,…
## $ TEAM_PITCHING_HR <int> 83, 88, 93, 159, 14, 20, 40, 39, 25, 62, 53, 173, 196…
## $ TEAM_PITCHING_BB <int> 447, 516, 509, 486, 257, 420, 613, 418, 497, 482, 521…
## $ TEAM_PITCHING_SO <int> 1080, 929, 816, 914, 1123, 736, 569, 715, 734, 622, 6…
## $ TEAM_FIELDING_E  <int> 140, 135, 156, 124, 616, 572, 490, 328, 226, 184, 200…
## $ TEAM_FIELDING_DP <int> 156, 164, 153, 154, 130, 105, NA, 104, 132, 145, 183,…
TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
1209 170 33 83 447 1080 62 50 NA 1209 83 447 1080 140 156
1221 151 29 88 516 929 54 39 NA 1221 88 516 929 135 164
1395 183 29 93 509 816 59 47 NA 1395 93 509 816 156 153
1539 309 29 159 486 914 148 57 42 1539 159 486 914 124 154
1445 203 68 5 95 416 NA NA NA 3902 14 257 1123 616 130
1431 236 53 10 215 377 NA NA NA 2793 20 420 736 572 105
1430 219 55 37 568 527 365 NA NA 1544 40 613 569 490 NA
1385 158 42 33 356 609 185 NA NA 1626 39 418 715 328 104
1259 177 78 23 466 689 150 NA NA 1342 25 497 734 226 132
1397 212 42 58 452 584 52 NA NA 1489 62 482 622 184 145
1427 243 40 50 495 640 64 NA NA 1501 53 521 673 200 183
1496 239 55 164 462 670 48 28 NA 1574 173 486 705 150 178
1420 223 57 186 511 751 31 21 NA 1494 196 538 790 137 167
1460 232 22 176 503 680 27 8 NA 1536 185 529 715 125 160
1411 195 22 141 485 665 59 48 NA 1411 141 485 665 115 114
1434 192 30 153 434 747 57 46 NA 1434 153 434 747 146 180
1297 204 22 130 491 1008 84 55 NA 1313 132 497 1021 154 126
1446 284 25 166 565 1041 77 39 55 1464 168 572 1054 115 172
1276 162 52 17 383 NA 138 NA NA 1351 18 406 NA 301 83
1715 322 72 116 527 397 90 83 NA 1816 123 558 420 232 174
1520 295 68 49 628 459 77 49 NA 1620 52 669 489 166 158
1597 291 38 98 629 563 54 43 NA 1702 104 670 600 155 174
1453 256 67 105 653 651 40 41 NA 1559 113 701 698 179 153
1378 225 26 118 533 677 18 36 NA 1450 124 561 712 160 174
1516 277 24 152 431 902 89 36 54 1516 152 431 902 105 164
1556 288 20 164 474 878 121 32 73 1556 164 474 878 102 156
1499 183 28 3 83 0 NA NA NA 5167 10 286 0 1224 NA
1464 263 58 47 385 479 63 66 NA 1540 49 405 504 232 146
1558 318 66 32 634 439 83 64 NA 1639 34 667 462 218 130
1502 308 36 39 432 602 45 46 NA 1601 42 460 642 199 135
1596 320 58 130 718 596 70 54 NA 1679 137 755 627 178 146
1546 260 59 110 630 541 72 65 NA 1648 117 671 577 167 166
1516 282 53 115 723 695 47 38 NA 1595 121 761 731 146 174
1550 275 47 146 765 723 29 20 NA 1631 154 805 761 178 177
1447 260 54 148 532 935 39 33 NA 1465 150 539 947 130 154
1450 252 28 203 594 855 50 48 NA 1450 203 594 855 156 131
1347 239 36 130 546 897 69 31 NA 1408 136 571 938 136 147
1561 260 56 214 531 911 66 47 NA 1571 215 534 917 133 163
1578 252 26 135 567 780 48 47 NA 2367 203 851 1170 137 162
1598 259 45 181 500 842 38 25 NA 1598 181 500 842 143 128
1497 322 21 145 599 711 41 34 NA 1506 146 603 715 130 147
1569 310 39 124 623 728 65 36 NA 1569 124 623 728 93 123
1119 118 33 7 37 0 NA NA NA 4120 26 136 0 1568 NA
1609 196 120 62 781 599 536 NA NA 1931 74 937 719 470 NA
1514 175 70 80 615 612 392 NA NA 1803 95 733 729 413 NA
1657 237 119 41 593 334 325 NA NA 2114 52 756 426 537 NA
1746 213 106 69 526 429 324 NA NA 2176 86 655 535 500 NA
1319 224 70 56 416 677 176 131 NA 1397 59 440 717 284 100
1293 204 70 18 437 630 134 NA NA 1360 19 460 663 281 127
1420 235 70 36 450 443 121 136 NA 1494 38 473 466 237 118
1496 269 54 76 412 500 55 NA NA 1574 80 433 526 177 171
1625 289 38 80 517 486 72 NA NA 1709 84 544 511 154 164
1391 239 50 145 499 1041 70 49 NA 1391 145 499 1041 162 147
1319 203 43 130 415 854 41 30 NA 1319 130 415 854 119 149
1411 251 35 107 471 912 93 64 NA 1411 107 471 912 174 149
1420 221 41 104 417 816 77 51 NA 1420 104 417 816 114 142
1552 206 106 38 566 401 334 NA NA 1849 45 674 478 411 119
1280 203 72 15 392 616 227 NA NA 1346 16 412 648 250 100
1120 122 61 7 427 NA 194 NA NA 1186 7 452 NA 332 106
1390 183 84 18 445 NA 216 NA NA 1462 19 468 NA 304 107
1554 252 81 29 494 414 174 NA NA 1798 34 572 479 200 134
1410 218 69 45 738 627 65 58 NA 1483 47 776 660 142 189
1507 262 28 159 573 907 107 52 NA 1516 160 577 913 126 132
1481 284 19 242 499 1030 78 51 63 1481 242 499 1030 100 167
1450 253 23 200 435 1002 137 67 79 1450 200 435 1002 94 166
1637 260 93 26 487 288 446 NA NA 2088 33 621 367 321 NA
1436 202 82 44 376 681 160 NA NA 1674 51 438 794 414 119
1600 218 89 21 344 538 152 NA NA 1851 24 398 623 373 137
1348 168 76 23 506 NA 296 NA NA 1427 24 536 NA 327 127
1460 191 111 22 612 629 306 NA NA 1546 23 648 666 314 114
1621 255 126 37 478 350 54 NA NA 1705 39 503 368 193 168
1433 241 49 45 468 501 52 NA NA 1507 47 492 527 127 203
1440 232 48 155 586 679 49 32 NA 1515 163 616 714 144 204
1479 211 34 232 555 799 47 23 NA 1556 244 584 841 119 155
1573 281 36 106 379 938 59 55 NA 1573 106 379 938 144 144
1558 224 42 171 474 1042 79 56 NA 1558 171 474 1042 168 158
1385 225 46 130 637 961 147 66 NA 1457 137 670 1011 116 150
1419 250 27 164 488 1006 124 56 NA 1419 164 488 1006 125 131
1284 198 61 19 383 NA 186 NA NA 1351 20 403 NA 270 100
1403 200 68 10 390 NA 201 NA NA 1495 11 416 NA 262 119
1631 358 48 105 553 455 55 34 NA 1716 110 582 479 179 173
1666 343 82 98 487 600 67 57 NA 1764 104 516 635 184 156
1804 376 86 129 541 494 69 56 NA 1898 136 569 520 191 162
1534 284 53 74 539 624 50 44 NA 1614 78 567 656 173 202
1472 222 52 156 659 788 48 41 NA 1548 164 693 829 163 148
1489 229 21 134 467 603 61 26 NA 1566 141 491 634 133 174
1367 198 21 156 506 857 109 46 NA 1367 156 506 857 114 127
1485 222 46 101 534 692 88 88 NA 1494 102 537 696 131 146
1458 225 32 109 651 625 151 68 NA 1458 109 651 625 123 129
1530 334 30 198 630 1061 143 60 NA 1530 198 630 1061 110 146
1421 160 72 30 523 508 289 NA NA 1731 37 637 619 445 NA
1869 301 122 58 347 127 399 NA NA 10814 336 2008 735 1261 NA
1400 169 66 26 431 344 156 NA NA 1680 31 517 413 398 133
1494 193 81 12 340 NA 207 NA NA 1614 13 367 NA 285 85
1449 223 62 20 423 NA 298 NA NA 1544 21 451 NA 286 93
1385 200 76 29 483 NA 262 NA NA 1457 31 508 NA 296 83
1443 218 99 24 716 554 254 154 NA 1518 25 753 583 271 113
1825 284 106 61 616 398 101 94 NA 1932 65 652 421 245 113
1627 296 95 38 630 445 93 76 NA 1712 40 663 468 207 159
1623 299 106 54 622 445 149 77 NA 1718 57 659 471 221 183
1556 298 82 60 500 550 72 53 NA 1637 63 526 579 187 176
1381 228 39 80 535 501 41 42 NA 1453 84 563 527 203 149
1556 272 46 114 532 634 32 37 NA 1637 120 560 667 138 157
1416 206 32 168 610 775 36 18 NA 1490 177 642 815 130 138
1413 257 21 204 546 1268 87 50 NA 1413 204 546 1268 135 157
1504 253 102 33 262 482 NA NA NA 2901 64 505 930 652 154
1193 165 68 45 299 1011 NA NA NA 1726 65 432 1462 743 NA
1461 325 30 166 470 1145 89 40 67 1461 166 470 1145 103 174
1458 294 36 187 590 999 89 30 61 1458 187 590 999 101 136
1295 237 64 25 360 814 129 NA NA 1734 33 482 1090 609 NA
1431 263 58 118 591 675 155 75 NA 1431 118 591 675 155 151
1469 305 59 98 498 644 216 84 NA 1469 98 498 644 150 153
1633 266 59 115 508 709 185 43 NA 1633 115 508 709 141 150
1603 295 58 132 442 758 133 48 NA 1603 132 442 758 127 140
1487 269 52 117 400 832 106 64 NA 1487 117 400 832 129 157
1474 318 44 101 501 884 108 62 NA 1483 102 504 889 123 162
1594 296 52 152 538 938 128 39 NA 1604 153 541 944 126 190
1415 285 42 140 524 921 140 65 52 1415 140 524 921 130 153
1445 289 34 126 424 1008 53 33 63 1445 126 424 1008 125 163
1362 199 81 29 408 508 386 NA NA 1576 34 472 588 581 NA
1572 195 106 30 522 288 297 NA NA 1721 33 571 315 344 NA
1209 168 56 16 435 NA 217 NA NA 1280 17 461 NA 363 92
1242 155 69 20 368 NA 132 NA NA 1359 22 403 NA 287 103
1098 116 63 29 340 NA 119 NA NA 1155 31 358 NA 254 69
1235 175 77 26 457 743 159 NA NA 1299 27 481 782 246 131
1651 247 80 59 357 335 83 63 NA 1737 62 376 352 219 146
1712 265 85 68 463 406 39 32 NA 1813 72 490 430 221 138
1391 206 78 41 390 523 112 NA NA 1473 43 413 554 239 124
1625 299 73 105 534 481 85 NA NA 1721 111 565 509 203 120
1740 319 77 128 506 569 56 NA NA 1830 135 532 599 178 176
1626 303 55 84 584 592 59 NA NA 1733 90 622 631 192 150
1471 277 36 65 602 509 85 NA NA 1547 68 633 535 145 158
1373 232 14 130 478 966 155 67 NA 1373 130 478 966 179 118
1466 215 35 158 527 1151 143 51 NA 1649 178 593 1295 146 135
1450 226 30 203 536 1092 102 41 62 1450 203 536 1092 73 145
1474 223 57 18 259 391 NA NA NA 2985 36 524 792 780 75
1335 228 49 120 500 909 106 75 NA 1335 120 500 909 127 168
1455 233 36 97 435 677 52 57 NA 1464 98 438 681 137 157
1477 272 35 82 511 779 256 115 NA 1477 82 511 779 89 146
1426 240 25 125 555 932 138 93 NA 1426 125 555 932 131 148
1255 183 61 11 304 814 161 NA NA 1346 12 326 873 336 104
1264 141 79 9 392 NA 181 NA NA 1347 10 418 NA 294 95
1695 310 89 66 610 421 110 44 NA 1795 70 646 446 193 173
1460 274 66 63 538 674 54 53 NA 1536 66 566 709 222 170
1349 237 46 53 610 639 50 39 NA 1419 56 642 672 137 160
1340 226 40 117 554 771 14 40 NA 1410 123 583 811 135 167
1396 257 42 150 554 969 92 33 NA 1396 150 554 969 172 158
1472 259 47 82 604 684 99 56 NA 1472 82 604 684 146 171
1544 256 46 112 526 693 66 45 NA 1544 112 526 693 134 203
1453 282 41 141 502 779 68 44 NA 1453 141 502 779 120 139
1446 257 39 196 501 977 81 61 NA 1446 196 501 977 118 168
1468 289 30 106 506 990 119 61 NA 1486 107 512 1002 93 146
1546 44 29 0 15 44 0 0 NA 22768 0 221 648 1473 NA
1372 195 31 103 353 932 36 31 NA 1372 103 353 932 166 154
1365 203 29 98 547 958 89 43 NA 1365 98 547 958 112 135
1314 172 26 112 436 1031 141 64 NA 1314 112 436 1031 151 171
1469 323 41 200 547 1071 146 35 62 1469 200 547 1071 104 131
1382 185 86 32 326 642 NA NA NA 2073 48 489 963 680 NA
1642 218 135 29 449 459 252 NA NA 2000 35 547 559 488 93
1324 153 65 17 437 NA 201 NA NA 1420 18 469 NA 352 101
1770 313 116 160 677 599 96 63 NA 1862 168 712 630 219 139
1765 293 83 164 792 587 146 72 NA 1869 174 839 622 177 139
1590 277 76 113 657 510 74 50 NA 1729 123 714 554 164 124
1775 334 88 193 741 629 82 42 NA 1879 204 785 666 173 157
1635 297 77 183 746 639 63 38 NA 1720 193 785 672 178 141
1557 264 79 146 655 503 25 25 NA 1638 154 689 529 126 169
1485 210 57 153 591 746 52 40 NA 1562 161 622 785 129 193
1461 229 41 152 515 597 66 47 NA 1479 154 521 604 124 185
1322 208 19 147 427 1027 119 45 NA 1322 147 427 1027 126 164
1462 281 18 163 536 903 78 37 NA 1462 163 536 903 112 165
1537 217 115 23 517 NA 275 NA NA 1638 25 551 NA 280 123
1495 236 85 35 565 579 234 NA NA 1583 37 598 613 224 114
1468 280 70 66 565 488 60 50 NA 1585 71 610 527 187 141
1689 296 74 59 580 343 103 66 NA 1777 62 610 361 204 130
1533 301 59 104 536 567 64 36 NA 1634 111 571 604 224 140
1379 229 55 64 636 592 39 35 NA 1451 67 669 623 150 169
1373 223 37 94 718 590 55 45 NA 1444 99 755 621 147 156
1394 215 43 118 505 765 42 32 NA 1466 124 531 805 175 197
1371 223 36 116 540 783 17 12 NA 1442 122 568 824 134 157
1400 210 28 148 617 953 100 39 NA 1400 148 617 953 137 162
1327 209 33 114 596 823 343 124 NA 1335 115 600 828 145 131
1432 263 33 199 593 1056 140 63 NA 1432 199 593 1056 142 122
1474 251 22 156 580 926 129 54 NA 1474 156 580 926 105 151
1450 279 28 205 609 1008 46 20 68 1450 205 609 1008 102 144
2025 292 140 32 259 70 259 NA NA 10935 173 1399 378 1172 NA
1669 281 102 35 391 473 580 NA NA 2033 43 476 576 643 NA
1631 291 79 52 650 604 307 NA NA 1987 63 792 736 566 NA
1420 299 79 5 233 587 NA NA NA 2347 8 385 970 1056 NA
1312 230 52 29 324 591 NA NA NA 1932 43 477 870 658 NA
2058 336 90 75 573 324 341 NA NA 2545 93 709 401 456 NA
1351 181 58 25 402 NA 169 NA NA 1440 27 428 NA 427 99
1452 199 87 17 433 NA 192 NA NA 1548 18 461 NA 293 106
1466 242 57 68 300 562 106 88 NA 1552 72 318 595 246 143
1534 256 44 64 406 511 59 NA NA 1635 68 433 545 195 166
1609 311 38 61 433 581 57 NA NA 1749 66 471 632 214 152
1344 207 28 59 472 527 57 NA NA 1414 62 497 554 246 158
1438 239 41 96 463 629 72 NA NA 1513 101 487 662 221 133
1368 225 53 139 686 708 46 34 NA 1439 146 722 745 116 123
1381 218 52 127 615 708 47 24 NA 1453 134 647 745 151 147
1498 250 59 130 603 916 54 35 NA 1576 137 634 964 136 143
1389 206 53 145 497 1098 46 32 NA 1398 146 500 1105 158 154
1448 224 49 117 510 969 56 42 NA 1448 117 510 969 113 147
1307 225 58 102 522 1073 72 64 NA 1315 103 525 1080 113 135
1517 250 38 104 563 654 156 70 NA 2297 157 852 990 130 136
1417 245 25 112 506 831 128 76 NA 1417 112 506 831 121 138
1352 209 45 125 640 906 143 75 NA 1352 125 640 906 152 117
1458 296 34 106 559 995 81 28 NA 1640 119 629 1119 108 156
1390 290 35 116 519 1032 92 56 NA 1390 116 519 1032 105 134
1475 257 80 52 515 573 284 NA NA 1810 64 632 703 471 NA
1378 178 85 35 512 604 246 NA NA 1654 42 614 725 570 NA
1817 277 155 60 541 259 319 NA NA 2264 75 674 323 441 NA
1711 213 133 29 418 375 195 NA NA 1860 32 454 408 392 NA
1415 217 112 52 552 613 168 NA NA 1489 55 581 645 243 138
1263 190 32 97 511 762 45 43 NA 1329 102 538 802 190 176
1328 221 63 96 495 686 23 23 NA 1397 101 521 722 175 184
1571 248 59 126 511 786 36 25 NA 1653 133 538 827 135 171
1522 235 70 130 444 871 66 34 NA 1522 130 444 871 137 195
1550 278 57 133 474 878 260 120 NA 1550 133 474 878 145 137
1412 237 33 98 438 841 96 62 NA 1412 98 438 841 128 142
1344 243 46 111 560 959 120 61 NA 1361 112 567 971 126 130
1441 276 30 141 513 1094 95 62 NA 1621 159 577 1231 136 155
1395 271 35 107 393 1060 159 51 NA 1395 107 393 1060 140 161
1506 320 31 168 564 1032 86 40 66 1506 168 564 1032 132 169
1437 269 39 143 418 1073 63 40 96 1446 144 421 1080 104 190
2170 241 70 13 111 102 92 76 NA 6893 41 353 324 1217 NA
1324 194 53 94 537 775 101 58 NA 1332 95 540 780 141 155
1442 239 25 136 484 917 96 68 NA 1442 136 484 917 135 135
1413 279 37 157 602 1177 131 53 46 1413 157 602 1177 141 155
1416 269 39 130 600 977 99 44 49 1416 130 600 977 109 136
1523 216 97 33 360 712 NA NA NA 2203 48 521 1030 743 NA
1294 169 51 24 546 NA 217 NA NA 1370 25 578 NA 244 79
1668 251 98 79 497 413 145 121 NA 1766 84 526 437 198 164
1422 215 53 140 660 662 44 NA NA 1496 147 694 696 144 190
1524 231 31 200 513 807 72 49 NA 1496 196 504 792 139 150
1392 227 41 134 568 842 90 59 NA 1392 134 568 842 178 136
1318 200 44 80 512 845 101 58 NA 1326 80 515 850 157 125
1499 229 26 112 528 980 126 76 NA 1499 112 528 980 169 134
1345 215 48 141 471 973 95 57 NA 1345 141 471 973 108 151
1620 210 139 66 542 355 233 NA NA 1988 81 665 436 523 NA
1339 185 80 34 413 579 149 NA NA 1682 43 519 727 276 146
1621 272 86 95 503 545 87 NA NA 1705 100 529 573 208 148
1585 288 62 105 572 498 39 NA NA 1667 110 602 524 118 170
1576 269 46 67 542 513 58 NA NA 1658 70 570 540 143 158
1541 300 49 101 451 781 117 54 NA 1541 101 451 781 122 174
1149 175 18 59 529 974 133 77 NA 1209 62 556 1025 175 155
1626 265 27 125 483 593 92 49 NA 1636 126 486 597 148 170
1461 228 29 121 423 812 82 50 NA 1470 122 426 817 139 139
1472 284 39 181 483 984 113 67 NA 1472 181 483 984 130 145
1366 218 39 99 451 649 28 52 NA 1374 100 454 653 131 164
1489 287 36 195 470 1094 156 55 74 1489 195 470 1094 97 184
1457 305 38 187 522 1142 71 18 53 1457 187 522 1142 107 159
1454 220 52 9 97 393 NA NA NA 3141 19 210 849 994 95
1642 221 98 56 638 451 319 NA NA 2031 69 789 558 492 NA
819 72 72 18 198 1107 NA NA NA 7371 162 1782 9963 936 NA
1251 162 23 95 492 860 71 69 NA 1299 99 511 893 139 146
1345 190 23 125 695 777 77 68 NA 1345 125 695 777 163 156
1381 263 37 102 463 976 196 63 NA 1381 102 463 976 124 113
1410 270 36 122 542 860 228 56 NA 1410 122 542 860 159 144
1423 339 34 172 420 1084 75 46 NA 1423 172 420 1084 131 150
mlb_test %>%
  summary() %>%
  kable() %>%
  kable_styling()
TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
Min. : 819 Min. : 44.0 Min. : 14.00 Min. : 0.00 Min. : 15.0 Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. :42.00 Min. : 1155 Min. : 0.0 Min. : 136.0 Min. : 0.0 Min. : 73.0 Min. : 69.0
1st Qu.:1387 1st Qu.:210.0 1st Qu.: 35.00 1st Qu.: 44.50 1st Qu.:436.5 1st Qu.: 545.0 1st Qu.: 59.0 1st Qu.: 38.00 1st Qu.:53.50 1st Qu.: 1426 1st Qu.: 52.0 1st Qu.: 471.0 1st Qu.: 613.0 1st Qu.: 131.0 1st Qu.:131.0
Median :1455 Median :239.0 Median : 52.00 Median :101.00 Median :509.0 Median : 686.0 Median : 92.0 Median : 49.50 Median :62.00 Median : 1515 Median :104.0 Median : 526.0 Median : 745.0 Median : 163.0 Median :148.0
Mean :1469 Mean :241.3 Mean : 55.91 Mean : 95.63 Mean :499.0 Mean : 709.3 Mean :123.7 Mean : 52.32 Mean :62.37 Mean : 1813 Mean :102.1 Mean : 552.4 Mean : 799.7 Mean : 249.7 Mean :146.1
3rd Qu.:1548 3rd Qu.:278.5 3rd Qu.: 72.00 3rd Qu.:135.50 3rd Qu.:565.5 3rd Qu.: 912.0 3rd Qu.:151.8 3rd Qu.: 63.00 3rd Qu.:67.50 3rd Qu.: 1681 3rd Qu.:142.5 3rd Qu.: 606.5 3rd Qu.: 938.0 3rd Qu.: 252.0 3rd Qu.:164.0
Max. :2170 Max. :376.0 Max. :155.00 Max. :242.00 Max. :792.0 Max. :1268.0 Max. :580.0 Max. :154.00 Max. :96.00 Max. :22768 Max. :336.0 Max. :2008.0 Max. :9963.0 Max. :1568.0 Max. :204.0
NA NA NA NA NA NA’s :18 NA’s :13 NA’s :87 NA’s :240 NA NA NA NA’s :18 NA NA’s :31
skim(mlb_test)
Data summary
Name mlb_test
Number of rows 259
Number of columns 15
_______________________
Column type frequency:
numeric 15
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
TEAM_BATTING_H 0 1.00 1469.39 150.66 819 1387.0 1455.0 1548.00 2170 ▁▂▇▁▁
TEAM_BATTING_2B 0 1.00 241.32 49.52 44 210.0 239.0 278.50 376 ▁▂▇▇▂
TEAM_BATTING_3B 0 1.00 55.91 27.14 14 35.0 52.0 72.00 155 ▇▇▃▁▁
TEAM_BATTING_HR 0 1.00 95.63 56.33 0 44.5 101.0 135.50 242 ▆▅▇▃▁
TEAM_BATTING_BB 0 1.00 498.96 120.59 15 436.5 509.0 565.50 792 ▁▁▅▇▁
TEAM_BATTING_SO 18 0.93 709.34 243.11 0 545.0 686.0 912.00 1268 ▁▃▇▇▂
TEAM_BASERUN_SB 13 0.95 123.70 93.39 0 59.0 92.0 151.75 580 ▇▃▁▁▁
TEAM_BASERUN_CS 87 0.66 52.32 23.10 0 38.0 49.5 63.00 154 ▂▇▃▁▁
TEAM_BATTING_HBP 240 0.07 62.37 12.71 42 53.5 62.0 67.50 96 ▃▇▅▁▁
TEAM_PITCHING_H 0 1.00 1813.46 1662.91 1155 1426.5 1515.0 1681.00 22768 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 102.15 57.65 0 52.0 104.0 142.50 336 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 552.42 172.95 136 471.0 526.0 606.50 2008 ▆▇▁▁▁
TEAM_PITCHING_SO 18 0.93 799.67 634.31 0 613.0 745.0 938.00 9963 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 249.75 230.90 73 131.0 163.0 252.00 1568 ▇▁▁▁▁
TEAM_FIELDING_DP 31 0.88 146.06 25.88 69 131.0 148.0 164.00 204 ▁▂▇▇▂
mlb_training %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "blue", color="blue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

ggplot(gather(mlb_training), aes(value)) + 
    geom_histogram(bins = 8) + 
    facet_wrap(~key, scales = 'free_x')

The density and histogram show some of the features with positive skewness to the right. TEAM_PITCHING_BB, TEAM_PITCHING_H, TEAM_PITCHING_SO, and TEAM_FIELDING_E have distinct skewness to the right.TEAM_BATTING_SO, TEAM_BATTING_2B, and TEAM_FIELDING_DP appear to show a normal distribution.

mlb_training %>% 
  ggplot(aes(TARGET_WINS)) + 
  geom_histogram(bins = 50, fill = 'blue', color="black",) +
  geom_vline(aes(xintercept = mean(TARGET_WINS, na.rm = T)), col = "red", lty = 2) +
  geom_vline(aes(xintercept = median(TARGET_WINS, na.rm = T)), col = "yellow", lty = 2) +
  labs(x = element_blank(),
       y = "Count",
       title = "Distribution of Wins",
       caption = "* Red line is the mean value and yellow is the median") + 
  theme_classic()

The TARGET_WINS column follows a normal distribution. This will be important when deciding which transformation method to use. Since TARGET_WINS will be our dependent variable, using a Box-Cox transformation may not be the best method to use since our dependent variable already follows a normal distribution.

mlb_training %>%
  gather(-TARGET_WINS, key = "var", value = "value") %>% 
  ggplot(aes(x = value, y = TARGET_WINS)) +
    facet_wrap(~ var, scales = "free") +
    geom_point(fill = "blue", color="blue") +
    geom_smooth(method = "lm", se = FALSE, color = "black") + 
  labs(x = element_blank(), y = "Wins")

In terms of the relationship between each independent variable and the dependent variable TARGET_WINS, the variables that appear to show a linear relationship are TEAM_BATTING_BB, TEAM_BATTING_H, and TEAM_BATTING_2B. Variables

ggplot(stack(mlb_training), aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 5000))+
  labs(x = element_blank(), y = element_blank()) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Measuring Correlation of Features

cor_matrix <- mlb_training %>% 
  cor(., use = "complete.obs") 
ggcorrplot::ggcorrplot(cor_matrix, type = "lower",
          lab = TRUE, lab_size = 2.1, tl.cex = 8)

cor_matrix[lower.tri(cor_matrix, diag=TRUE)] <- ""
cor_matrix <- cor_matrix %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  gather(Variable, Correlation, -rowname) %>%
  filter(Variable != rowname) %>%
  filter(Correlation != "") %>%
  mutate(Correlation = as.numeric(Correlation)) %>%
  rename(` Variable` = rowname) %>%
  arrange(desc(abs(Correlation))) 
cor_matrix %>%
  filter(abs(Correlation) > .5) %>%
  kable() %>%
  kable_styling()
Variable Variable Correlation
TEAM_BATTING_HR TEAM_PITCHING_HR 0.9999326
TEAM_BATTING_BB TEAM_PITCHING_BB 0.9998814
TEAM_BATTING_SO TEAM_PITCHING_SO 0.9997684
TEAM_BATTING_H TEAM_PITCHING_H 0.9991927
TEAM_BASERUN_SB TEAM_BASERUN_CS 0.6247378
TEAM_BATTING_H TEAM_BATTING_2B 0.5617729
TEAM_BATTING_2B TEAM_PITCHING_H 0.5604535

There were high correlation coefficienys between batting and pitching stats that were measuring the same feature. For instance, homeruns (HR), Walks (BB), Strikeouts (SO), and Hits (H) for batting and pitching were highly correlated with their respective feature. With the exception of strikeouts, the same stats have the highest correlation coefficient with wins.

Missing Data

mlb_training %>% 
  gather(variable, value) %>%
  filter(is.na(value)) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(mlb_training) * 100) %>%
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
  rename(`Variable Missing Data` = variable,
         `Number of Records` = n,
         `Share of Total` = percent) %>%
  kable(caption="<center>Missing Training Data Count and Percentage", align = "c") %>% 
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Missing Training Data Count and Percentage
Variable Missing Data Number of Records Share of Total
TEAM_BATTING_HBP 2085 92%
TEAM_BASERUN_CS 772 34%
TEAM_FIELDING_DP 286 13%
TEAM_BASERUN_SB 131 5.8%
TEAM_BATTING_SO 102 4.5%
TEAM_PITCHING_SO 102 4.5%
mlb_test %>% 
  gather(variable, value) %>%
  filter(is.na(value)) %>%
  group_by(variable) %>%
  tally() %>%
  mutate(percent = n / nrow(mlb_test) * 100) %>%
  mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
  arrange(desc(n)) %>%
  rename(`Variable Missing Data` = variable,
         `Number of Records` = n,
         `Share of Total` = percent) %>%
  kable(caption="<center>Missing Test Data Count and Percentage", align = "c") %>% 
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Missing Test Data Count and Percentage
Variable Missing Data Number of Records Share of Total
TEAM_BATTING_HBP 240 93%
TEAM_BASERUN_CS 87 34%
TEAM_FIELDING_DP 31 12%
TEAM_BATTING_SO 18 6.9%
TEAM_PITCHING_SO 18 6.9%
TEAM_BASERUN_SB 13 5%

The amount of missing values in the training and test datasets for TEAM_BATTING_HBP is exceptionally large, missing 92% and 93% of its values, respectively. If we imputed the missing values for an amount that large, the results would lack natural variation that could result in an effective model. Combined with their low correlation to TARGET_WINS, we decided to drop the column in the training and test data. For the other variables with missing values, we used a K-Nearest Neighbor function from the VIM package to impute the missing values.

Flaws of Imputing TEAM_BATTING_HBP

Dropping HBP was a difficult decision to make, considering HBP is used to calculate On-Base Percentage and At-Bats (AB), with AB also used to calculate Slugging Percentage and Batting Average. Having additional baseball statistics that are commonly used to evaluate team performance would have been helpful for this analysis. However, as you will see, the additional statistics that were produced when imputing HBP created higher than normal stats, particularly with Batting Average. The highest team batting average ever was accomplished by the Philadelphia Phillies in 1894, which had a Batting Average of .350. The max of the Batting Average column is 0.9428571. Therefore, we can infer that after imputing HBP, the averages of the other statistics become inflated, and therefore unreliable when predicting the number of wins.

set.seed(123)
mlb_train_imp2 <- mlb_training %>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO", "TEAM_BATTING_HBP"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
set.seed(123)
mlb_train_imp2 <- mlb_train_imp2 %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  dplyr::mutate(TEAM_BATTING_AB = TEAM_BATTING_H + TEAM_PITCHING_BB + TEAM_BATTING_SO + TEAM_BATTING_HBP) %>%
  dplyr::mutate(TEAM_BATTING_AVERAGE = TEAM_BATTING_H/TEAM_BATTING_AB) %>%
  dplyr::mutate(TEAM_BATTING_OBP = (TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP)/(TEAM_BATTING_AB + TEAM_BATTING_BB + TEAM_BATTING_HBP)) %>%
  dplyr::mutate(TEAM_BATTING_SLG = (TEAM_BATTING_1B + 2*TEAM_BATTING_2B + 3*TEAM_BATTING_3B + 4*TEAM_BATTING_HR)/TEAM_BATTING_AB) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B) 
reactable(mlb_train_imp2)
max(mlb_train_imp2$TEAM_BATTING_AVERAGE)
## [1] 0.9428571

Dropping TEAM_BATTING_HBP and Imputing Missing Values

After deciding to drop HBP, we decided to use the K-Nearest Neighbors function kNN() from the VIM package to impute the missing data. Using KNN is advantageous because it considers the relationships between observations, leading to more accurate imputations than simpler methods like mean or mode imputation.

Training

set.seed(123)
mlb_training_no_hbp <- mlb_training %>%
  dplyr::select(-TEAM_BATTING_HBP)
set.seed(123)
mlb_train_imp <- mlb_training_no_hbp %>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
reactable(mlb_train_imp)
sum(is.na(mlb_train_imp))
## [1] 0
sum(is.nan(as.matrix(mlb_train_imp)))
## [1] 0
sum(is.infinite(as.matrix(mlb_train_imp)))
## [1] 0

Test Set

set.seed(123)
mlb_test_no_hbp <- mlb_test %>%
  dplyr::select(-TEAM_BATTING_HBP)
set.seed(123)
mlb_test_imp <- mlb_test_no_hbp %>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
reactable(mlb_test_imp)
sum(is.na(mlb_test_imp))
## [1] 0
sum(is.nan(as.matrix(mlb_test_imp)))
## [1] 0
sum(is.infinite(as.matrix(mlb_test_imp)))
## [1] 0

Data Preparation

When analyzing the dataset, we realized that the number of singles (1B) were not a feature. We can deduce that the total number of homeruns (HR), triples (3B), and doubles (2B) can be subtracted from the total number of hits in order to get the total amount of singles.

mlb_train_imp <- mlb_train_imp %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B)
mlb_train_imp %>% 
  ggplot(aes(TEAM_BATTING_1B)) + 
  geom_histogram(bins = 50, fill = 'blue', color="black",) +
  geom_vline(aes(xintercept = mean(TEAM_BATTING_1B, na.rm = T)), col = "red", lty = 2) +
  geom_vline(aes(xintercept = median(TEAM_BATTING_1B, na.rm = T)), col = "yellow", lty = 2) +
  labs(x = element_blank(),
       y = "Count",
       title = "Distribution of Singles",
       caption = "* Red line is the mean value and yellow is the median") + 
  theme_classic()

The number of singles shows a positive right skewness. Let’s take a look at the correlation matrix that includes the new feature as well as the imputed data:

cor_matrix2 <- mlb_train_imp %>% 
  cor(., use = "complete.obs") 
ggcorrplot::ggcorrplot(cor_matrix2, type = "lower",
          lab = TRUE, lab_size = 2.1, tl.cex = 8)

Batting hits had the highest correlation coefficient with TARGET_WINS, but had decreased from the initial correlation matrix. The new variable for singles had a correlation coefficient of 0.22 with TARGET_WINS, which was 4th-highest behind batting hits, batting doubles, and batting walks.

Log-Transforming Variables

Since transforming the variables before imputing missing values helps preserve the relationships between the variables in the regression model, we decided to transform the data first before using our KNN imputation method.

The independent variables that aren’t normally distributed show positive skewness, and for this reason we decided to use a log-transformation method.

Training Set

mlb_training_no_hbp <- mlb_training_no_hbp %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B)
mlb_train_log <- log(mlb_training_no_hbp)
set.seed(123)
mlb_train_log <- mlb_train_log%>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
sum(is.na(mlb_train_log))
## [1] 0
sum(is.nan(as.matrix(mlb_train_log)))
## [1] 0
sum(is.infinite(as.matrix(mlb_train_log)))
## [1] 271

After log-transforming the data, there were a considerable amount of infinite values created. To address this, we turned those values into NA values and imputed the data:

mlb_train_log[sapply(mlb_train_log, is.infinite)] <- NA
set.seed(123)
mlb_train_log <- mlb_train_log %>%
  kNN(k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
mlb_train_log %>%
  gather(-TARGET_WINS, key = "var", value = "value") %>% 
  ggplot(aes(x = value, y = TARGET_WINS)) +
    facet_wrap(~ var, scales = "free") +
    geom_point(fill = "blue", color="blue") +
    geom_smooth(method = "lm", se = FALSE, color = "black") + 
  labs(x = element_blank(), y = "Wins")

ggplot(gather(mlb_train_log), aes(value)) + 
    geom_histogram(bins = 8) + 
    facet_wrap(~key, scales = 'free_x')

mlb_train_log %>%
  gather(variable, value, TARGET_WINS:TEAM_FIELDING_DP) %>%
  ggplot(., aes(value)) + 
  geom_density(fill = "blue", color="blue") + 
  facet_wrap(~variable, scales ="free", ncol = 4) +
  labs(x = element_blank(), y = element_blank())

After log-transforming and imputing the data,

Test Set

mlb_test_no_hbp <- mlb_test_no_hbp %>%
  dplyr::mutate(TEAM_BATTING_1B = TEAM_BATTING_H - dplyr::select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE)) %>%
  relocate(TEAM_BATTING_1B, .before = TEAM_BATTING_2B)
mlb_test_log <- log(mlb_test_no_hbp)
set.seed(123)
mlb_test_log <- mlb_test_log%>%
  kNN(variable = c("TEAM_BASERUN_CS", "TEAM_FIELDING_DP", "TEAM_BASERUN_SB", "TEAM_BATTING_SO", "TEAM_PITCHING_SO"),
      k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
sum(is.na(mlb_test_log))
## [1] 0
sum(is.nan(as.matrix(mlb_test_log)))
## [1] 0
sum(is.infinite(as.matrix(mlb_test_log)))
## [1] 10
mlb_test_log[sapply(mlb_test_log, is.infinite)] <- NA
set.seed(123)
mlb_test_log <- mlb_test_log %>%
  kNN(k = 5, numFun = weighted.mean, weightDist = TRUE, imp_var = FALSE)
sum(is.na(mlb_test_log))
## [1] 0
sum(is.nan(as.matrix(mlb_test_log)))
## [1] 0
sum(is.infinite(as.matrix(mlb_test_log)))
## [1] 0
ggplot(gather(mlb_test_log), aes(value)) + 
    geom_histogram(bins = 10) + 
    facet_wrap(~key, scales = 'free_x')

Model Building

Model 1 - Using findCorrelation function from caret package

For our first model, we utilized the findCorrelation function to determine which variables to use in our model:

set.seed(123)
highlyCorDescr <- findCorrelation(cor(mlb_train_imp), cutoff = .50, verbose = TRUE, names=TRUE)
## Compare row 6  and column  12 with corr  0.969 
##   Means:  0.445 vs 0.299 so flagging column 6 
## Compare row 12  and column  8 with corr  0.664 
##   Means:  0.385 vs 0.28 so flagging column 12 
## Compare row 8  and column  15 with corr  0.585 
##   Means:  0.361 vs 0.264 so flagging column 8 
## Compare row 15  and column  3 with corr  0.548 
##   Means:  0.351 vs 0.25 so flagging column 15 
## Compare row 3  and column  5 with corr  0.6 
##   Means:  0.319 vs 0.234 so flagging column 3 
## Compare row 5  and column  9 with corr  0.545 
##   Means:  0.27 vs 0.223 so flagging column 5 
## Compare row 10  and column  16 with corr  0.537 
##   Means:  0.23 vs 0.21 so flagging column 10 
## Compare row 2  and column  4 with corr  0.563 
##   Means:  0.23 vs 0.205 so flagging column 2 
## All correlations <= 0.5
set.seed(123)
keep_these <- names(mlb_train_imp)[!(names(mlb_train_imp) %in% colnames(mlb_train_imp)[highlyCorDescr])]
mlb_train_features <- mlb_train_imp[, keep_these]
reactable(mlb_train_features)
set.seed(123)
m1 <- lm(TARGET_WINS ~., data = mlb_train_features)
summary(m1)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = mlb_train_features)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.566  -8.345   0.303   8.189  52.052 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      28.6728210  5.3947533   5.315 1.17e-07 ***
## TEAM_BATTING_H    0.1237754  0.0266112   4.651 3.49e-06 ***
## TEAM_BATTING_1B  -0.0786938  0.0265437  -2.965 0.003062 ** 
## TEAM_BATTING_2B  -0.0942028  0.0280605  -3.357 0.000801 ***
## TEAM_BATTING_3B  -0.0566817  0.0283020  -2.003 0.045324 *  
## TEAM_BATTING_HR          NA         NA      NA       NA    
## TEAM_BATTING_BB   0.0161483  0.0056273   2.870 0.004147 ** 
## TEAM_BATTING_SO  -0.0171662  0.0025251  -6.798 1.35e-11 ***
## TEAM_BASERUN_SB   0.0453571  0.0044947  10.091  < 2e-16 ***
## TEAM_BASERUN_CS   0.0401036  0.0108571   3.694 0.000226 ***
## TEAM_PITCHING_H   0.0002165  0.0003664   0.591 0.554753    
## TEAM_PITCHING_HR  0.0221576  0.0234035   0.947 0.343859    
## TEAM_PITCHING_BB -0.0032305  0.0040037  -0.807 0.419823    
## TEAM_PITCHING_SO  0.0022881  0.0008826   2.592 0.009590 ** 
## TEAM_FIELDING_E  -0.0291447  0.0026099 -11.167  < 2e-16 ***
## TEAM_FIELDING_DP -0.1329713  0.0136476  -9.743  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.56 on 2261 degrees of freedom
## Multiple R-squared:  0.3677, Adjusted R-squared:  0.3638 
## F-statistic: 93.92 on 14 and 2261 DF,  p-value: < 2.2e-16

The initial results produced an Adjusted R-squared of 0.1857, meaning that only 0.1857 of the variance in wins can be explained by the variables in the model. Two variables, TEAM_PITCHING_H and TEAM_PITCHING_BB, had p-values greater than 0.05, which indicates that each feature does not have statistical significance when explaining the variance of wins. We removed both variables and re-ran the model:

set.seed(123)
m1_train_revised <- lm(TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_FIELDING_DP, data = mlb_train_features)
summary(m1_train_revised)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_BB + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_FIELDING_DP, data = mlb_train_features)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.836  -8.987   0.189   9.450  64.404 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      50.9419986  2.4345521  20.925  < 2e-16 ***
## TEAM_BATTING_2B   0.1052782  0.0067785  15.531  < 2e-16 ***
## TEAM_BATTING_BB   0.0340953  0.0027860  12.238  < 2e-16 ***
## TEAM_BASERUN_SB   0.0308966  0.0036161   8.544  < 2e-16 ***
## TEAM_PITCHING_SO -0.0029254  0.0005523  -5.297 1.29e-07 ***
## TEAM_FIELDING_DP -0.0992912  0.0140364  -7.074 2.00e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.22 on 2270 degrees of freedom
## Multiple R-squared:  0.1873, Adjusted R-squared:  0.1856 
## F-statistic: 104.7 on 5 and 2270 DF,  p-value: < 2.2e-16

In the revised model, the Adjusted R-squared decreased by 0.01 to 0.1856. The p-value of the overall model is the same as the initial model, with a value below 0.05. The residual standard error increased by 0.01 to 14.22. The F-statistic increased from 75.12 to 104.7, which may indicate that removing the two variables with high p-values helped improve the ability of the predictors to explain the variability in wins

ggplot(m1_train_revised, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m1_train_revised, aes(x = m1_train_revised$residuals)) +
    geom_histogram(bins = 10, fill = 'steelblue', color = 'black') +
    labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')

ggplot(data = m1_train_revised, aes(x = .resid)) +
  geom_histogram(binwidth = 0.4) +
  xlab("Residuals")

qqnorm(resid(m1_train_revised))
qqline(resid(m1_train_revised))

Model 2 - Using findCorrelation function for Log-Transformed Train Set

set.seed(123)
highlyCorDescr2 <- findCorrelation(cor(mlb_train_log), cutoff = .50, verbose = TRUE)
## Compare row 6  and column  15 with corr  0.78 
##   Means:  0.489 vs 0.325 so flagging column 6 
## Compare row 15  and column  12 with corr  0.659 
##   Means:  0.455 vs 0.305 so flagging column 15 
## Compare row 12  and column  8 with corr  0.522 
##   Means:  0.396 vs 0.282 so flagging column 12 
## Compare row 8  and column  3 with corr  0.801 
##   Means:  0.392 vs 0.263 so flagging column 8 
## Compare row 3  and column  5 with corr  0.593 
##   Means:  0.355 vs 0.247 so flagging column 3 
## Compare row 5  and column  14 with corr  0.563 
##   Means:  0.28 vs 0.237 so flagging column 5 
## Compare row 7  and column  11 with corr  0.685 
##   Means:  0.313 vs 0.219 so flagging column 7 
## Compare row 16  and column  9 with corr  0.544 
##   Means:  0.248 vs 0.198 so flagging column 16 
## Compare row 2  and column  4 with corr  0.578 
##   Means:  0.309 vs 0.175 so flagging column 2 
## All correlations <= 0.5
set.seed(123)
keep_these2 <- names(mlb_train_log)[!(names(mlb_train_log) %in% colnames(mlb_train_log)[highlyCorDescr2])]
mlb_train_features2 <- mlb_train_log[, keep_these2]
reactable(mlb_train_features2)
set.seed(123)
m2_log <- lm(TARGET_WINS ~., data = mlb_train_features2)
summary(m2_log)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = mlb_train_features2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.21655 -0.10935  0.02095  0.12652  0.65467 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.928902   0.206293  14.198  < 2e-16 ***
## TEAM_BATTING_2B   0.373551   0.022051  16.941  < 2e-16 ***
## TEAM_BASERUN_SB   0.064488   0.006880   9.373  < 2e-16 ***
## TEAM_BASERUN_CS   0.004051   0.009499   0.426     0.67    
## TEAM_PITCHING_H  -0.129390   0.014424  -8.971  < 2e-16 ***
## TEAM_PITCHING_BB  0.123832   0.017761   6.972 4.08e-12 ***
## TEAM_PITCHING_SO -0.111174   0.012877  -8.634  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1991 on 2269 degrees of freedom
## Multiple R-squared:  0.1906, Adjusted R-squared:  0.1884 
## F-statistic: 89.03 on 6 and 2269 DF,  p-value: < 2.2e-16

The initial results did not produce a trustworthy model. The Adjusted R-squared value was 18.84%, meaning the independent variables in the model can only account for 18.84% of the variance of TARGET_WINS. TEAM_BASERUN_CS was the only variable with a p-value greater than 0.05, which indicates that it is not statistically significant and could be affecting the model’s ability to determine the impact other variables may have on the explaining the variance of TARGET_WINS. We decided to re-run the model without the variable:

set.seed(123)
m2_log2 <- lm(TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_2B + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO, data = mlb_train_features2)
summary(m2_log2)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BASERUN_SB + TEAM_BATTING_2B + 
##     TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO, data = mlb_train_features2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22005 -0.10941  0.02093  0.12710  0.65297 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.968279   0.184439  16.094  < 2e-16 ***
## TEAM_BASERUN_SB   0.065409   0.006531  10.016  < 2e-16 ***
## TEAM_BATTING_2B   0.372693   0.021955  16.976  < 2e-16 ***
## TEAM_PITCHING_H  -0.130712   0.014084  -9.281  < 2e-16 ***
## TEAM_PITCHING_BB  0.122628   0.017532   6.994 3.49e-12 ***
## TEAM_PITCHING_SO -0.112025   0.012719  -8.808  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1991 on 2270 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1887 
## F-statistic: 106.8 on 5 and 2270 DF,  p-value: < 2.2e-16

Removing the variable had a slight impact the Adjusted R-squared value, which shows an increase of 0.03 to 18.87%.

ggplot(m2_log2, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m2_log2, aes(x = .resid)) +
  geom_histogram(binwidth = 0.04) +
  xlab("Residuals")

qqnorm(resid(m2_log2))
qqline(resid(m2_log2))

Model 3 - Using Recursive Feature Elimination on Imputed Train Set

set.seed(123)
filterCtrl <- rfeControl(functions=rfFuncs, method="cv", number=3)
results <- rfe(x= mlb_train_imp[,2:16],y= mlb_train_imp[,1], sizes=c(2:16), rfeControl=filterCtrl)
results
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (3 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables  RMSE Rsquared    MAE RMSESD RsquaredSD  MAESD Selected
##          2 14.17   0.2137 11.131 0.1632    0.03138 0.2706         
##          3 12.98   0.3209 10.263 0.2405    0.04370 0.2790         
##          4 12.73   0.3512 10.040 0.1634    0.03715 0.1218         
##          5 12.48   0.3842  9.825 0.2171    0.04292 0.1585         
##          6 12.00   0.4274  9.383 0.3385    0.05343 0.2741         
##          7 12.10   0.4205  9.515 0.2491    0.04685 0.2315         
##          8 12.08   0.4255  9.468 0.2882    0.05011 0.2251         
##          9 11.88   0.4419  9.290 0.2191    0.04224 0.2042         
##         10 11.90   0.4423  9.303 0.2775    0.04907 0.2164         
##         11 11.93   0.4430  9.347 0.2345    0.04506 0.2115         
##         12 11.85   0.4481  9.271 0.2436    0.04671 0.2161        *
##         13 11.90   0.4430  9.316 0.2755    0.04766 0.2605         
##         14 11.93   0.4416  9.327 0.2527    0.04601 0.2363         
##         15 11.87   0.4466  9.261 0.2487    0.04392 0.2278         
## 
## The top 5 variables (out of 12):
##    TEAM_FIELDING_E, TEAM_BATTING_H, TEAM_BASERUN_CS, TEAM_BATTING_BB, TEAM_PITCHING_SO
set.seed(123)
m3_train <- lm(TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + TEAM_BASERUN_CS + TEAM_BATTING_BB + TEAM_PITCHING_SO, data = mlb_train_imp)
summary(m3_train)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + 
##     TEAM_BASERUN_CS + TEAM_BATTING_BB + TEAM_PITCHING_SO, data = mlb_train_imp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.609  -8.786   0.103   9.000  50.032 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.347e+01  3.712e+00  -3.629 0.000291 ***
## TEAM_FIELDING_E  -1.618e-02  1.708e-03  -9.476  < 2e-16 ***
## TEAM_BATTING_H    5.447e-02  2.126e-03  25.624  < 2e-16 ***
## TEAM_BASERUN_CS   8.828e-02  8.222e-03  10.737  < 2e-16 ***
## TEAM_BATTING_BB   2.200e-02  3.109e-03   7.075 1.99e-12 ***
## TEAM_PITCHING_SO  1.731e-03  5.409e-04   3.199 0.001396 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.37 on 2270 degrees of freedom
## Multiple R-squared:  0.2809, Adjusted R-squared:  0.2793 
## F-statistic: 177.3 on 5 and 2270 DF,  p-value: < 2.2e-16
ggplot(m3_train, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m3_train, aes(x = m3_train$residuals)) +
    geom_histogram(bins = 10, fill = 'steelblue', color = 'black') +
    labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')

ggplot(data = m3_train, aes(x = .resid)) +
  geom_histogram(binwidth = 0.4) +
  xlab("Residuals")

qqnorm(resid(m3_train))
qqline(resid(m3_train))

Model 4 - Using Recursive Feature Elimination for Log-Transformed Imputed Train Set

set.seed(123)
filterCtrl2 <- rfeControl(functions=rfFuncs, method="cv", number=3)
results2 <- rfe(x= mlb_train_log[,2:16],y= mlb_train_log[,1], sizes=c(2:16), rfeControl=filterCtrl2)
results2
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (3 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE   RMSESD RsquaredSD    MAESD Selected
##          2 0.1980   0.2139 0.1480 0.008784    0.03466 0.005485         
##          3 0.1798   0.3368 0.1343 0.005996    0.08552 0.005938         
##          4 0.1751   0.3781 0.1298 0.001170    0.06430 0.002825         
##          5 0.1748   0.3855 0.1289 0.001654    0.06539 0.001581         
##          6 0.1705   0.4096 0.1256 0.003244    0.05195 0.003023         
##          7 0.1709   0.4084 0.1256 0.004063    0.05415 0.002838         
##          8 0.1701   0.4184 0.1251 0.003443    0.05773 0.002629         
##          9 0.1700   0.4153 0.1253 0.004262    0.06991 0.003601         
##         10 0.1700   0.4175 0.1252 0.004390    0.07165 0.003373         
##         11 0.1696   0.4225 0.1253 0.003666    0.07539 0.003654         
##         12 0.1694   0.4223 0.1245 0.003572    0.06565 0.002294        *
##         13 0.1697   0.4195 0.1245 0.003871    0.06149 0.002530         
##         14 0.1697   0.4202 0.1247 0.003687    0.07013 0.003204         
##         15 0.1698   0.4156 0.1249 0.003739    0.06493 0.002897         
## 
## The top 5 variables (out of 12):
##    TEAM_FIELDING_E, TEAM_BATTING_H, TEAM_BASERUN_SB, TEAM_PITCHING_SO, TEAM_BATTING_BB
set.seed(123)
m4_log <- lm(TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_BATTING_BB, data=mlb_train_log)
summary(m4_log)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E + TEAM_BATTING_H + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_SO + TEAM_BATTING_BB, data = mlb_train_log)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19699 -0.10778  0.00665  0.11636  0.80712 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.67180    0.40736  -6.559 6.70e-11 ***
## TEAM_FIELDING_E  -0.14556    0.01060 -13.736  < 2e-16 ***
## TEAM_BATTING_H    0.96713    0.04543  21.289  < 2e-16 ***
## TEAM_BASERUN_SB   0.10406    0.00710  14.656  < 2e-16 ***
## TEAM_PITCHING_SO -0.04167    0.01371  -3.040  0.00239 ** 
## TEAM_BATTING_BB   0.08924    0.01447   6.167 8.21e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1823 on 2270 degrees of freedom
## Multiple R-squared:  0.3215, Adjusted R-squared:  0.3201 
## F-statistic: 215.2 on 5 and 2270 DF,  p-value: < 2.2e-16
set.seed(123)
vip(m4_log)

ggplot(m4_log, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title="Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = m4_log, aes(x = m4_log$residuals)) +
    geom_histogram(bins = 10, fill = 'steelblue', color = 'black') +
    labs(title = 'Histogram of Residuals', x = 'Residuals', y = 'Frequency')

ggplot(data = m4_log, aes(x = .resid)) +
  geom_histogram(binwidth = 0.2) +
  xlab("Residuals")

qqnorm(resid(m4_log))
qqline(resid(m4_log))

Model Selection

set.seed(123)
compare_performance(m1_train_revised, m2_log2, m3_train, m4_log)
## # Comparison of Model Performance Indices
## 
## Name             | Model |   AIC (weights) |  AICc (weights) |   BIC (weights) |    R2 | R2 (adj.) |   RMSE |  Sigma
## --------------------------------------------------------------------------------------------------------------------
## m1_train_revised |    lm | 18549.6 (<.001) | 18549.7 (<.001) | 18589.7 (<.001) | 0.187 |     0.186 | 14.197 | 14.216
## m2_log2          |    lm |  -879.5 (<.001) |  -879.5 (<.001) |  -839.4 (<.001) | 0.190 |     0.189 |  0.199 |  0.199
## m3_train         |    lm | 18271.3 (<.001) | 18271.3 (<.001) | 18311.4 (<.001) | 0.281 |     0.279 | 13.355 | 13.373
## m4_log           |    lm | -1281.5 (>.999) | -1281.5 (>.999) | -1241.4 (>.999) | 0.322 |     0.320 |  0.182 |  0.182

Based on the results from each model, we decided to go with the 4th model, m4_log, which produced the highest Adjusted R-squared at 32% and the lowest Root Mean Squared Error at 0.182. The final variables for the model are TEAM_FIELDING_E, TEAM_BATTING_H TEAM_BASERUN_SB, TEAM_PITCHING_SO, and TEAM_BATTING_BB

set.seed(123)
predictions <- predict(m4_log, newdata = mlb_test_log)
summary(predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.490   4.296   4.378   4.362   4.448   4.740
set.seed(123)
mlb_test_log$predicted_wins <- round(exp(predictions))
set.seed(123)
mlb_test_log_final <- mlb_test_log %>%
  relocate(predicted_wins) 
reactable(mlb_test_log_final)

References